{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import re\n",
    "import os\n",
    "import codecs\n",
    "from threading import Thread\n",
    "from time import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scrape the DBLP release index page and collect the snapshot archive names.\n",
    "url = \"https://dblp.org/xml/release/\"\n",
    "# Template for a single archive URL; filled with one entry of `items`.\n",
    "download_url = \"https://dblp.org/xml/release/{}\"\n",
    "# Context manager releases the connection even if parsing raises.\n",
    "with requests.get(url) as rsp:\n",
    "    # Fail loudly on an HTTP error instead of scraping an error page.\n",
    "    rsp.raise_for_status()\n",
    "    # Dots are escaped so only names literally ending in .xml.gz are captured\n",
    "    # (an unescaped dot would match any character).\n",
    "    items = re.findall(r'\"(dblp-\\S+?\\.xml\\.gz)\"', rsp.text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_item(item):\n",
    "    \"\"\"Download one release archive into ./data/dblp/tgz/ and print progress.\n",
    "\n",
    "    The body is streamed into a temporary `<name>.part` file that is renamed\n",
    "    to its final name only after the transfer finishes, so an interrupted\n",
    "    download is never mistaken for a finished one by the existence check.\n",
    "\n",
    "    Args:\n",
    "        item: Archive file name; it is substituted into `download_url`.\n",
    "\n",
    "    Returns:\n",
    "        None. Returns immediately when the target file already exists.\n",
    "\n",
    "    Raises:\n",
    "        requests.HTTPError: If the server answers with an error status.\n",
    "    \"\"\"\n",
    "    target_dir = \"./data/dblp/tgz\"\n",
    "    target_path = \"./data/dblp/tgz/{}\".format(item)\n",
    "    if os.path.exists(target_path):\n",
    "        return  # already downloaded\n",
    "    # The output directory may not exist on a fresh checkout.\n",
    "    os.makedirs(target_dir, exist_ok=True)\n",
    "    partial_path = target_path + \".part\"\n",
    "    chunk_size = 1024*1024  # maximum bytes handed back per iteration\n",
    "    with requests.get(download_url.format(item), stream=True) as response:\n",
    "        # Do not store an HTTP error body as if it were the archive.\n",
    "        response.raise_for_status()\n",
    "        # Content-Length can be absent; 0 means the total size is unknown.\n",
    "        content_size = int(response.headers.get('content-length', 0))\n",
    "        data_count = 0  # bytes written so far\n",
    "        with open(partial_path, \"wb\") as file:\n",
    "            start_time = time()\n",
    "            for data in response.iter_content(chunk_size=chunk_size):\n",
    "                file.write(data)\n",
    "                data_count = data_count + len(data)\n",
    "                if content_size <= 0 or data_count == 0:\n",
    "                    continue  # no percentage can be computed\n",
    "                now_jd = (data_count / content_size) * 100  # percent complete\n",
    "                ctime = time() - start_time  # seconds elapsed\n",
    "                # Estimated total duration minus elapsed time = time still needed.\n",
    "                remaining = max(ctime / now_jd * 100 - ctime, 0.0)\n",
    "                print(\"\\r 文件下载进度：{:.5f} %, 还需要 {:.2f} seconds [{}]\".format(now_jd, remaining, target_path), end=\" \")\n",
    "    # Publish the file under its final name only once it is complete.\n",
    "    os.replace(partial_path, target_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Delete this archive from the directory download_item writes to\n",
    "# (presumably to force a fresh download of it -- confirm).\n",
    "# The path previously said data/dlpg, which download_item never writes to.\n",
    "!rm -f ./data/dblp/tgz/dblp-2017-01-01.xml.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "40\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Exception in thread Thread-20:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n",
      "Exception in thread Thread-17:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n",
      "Exception in thread Thread-13:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n",
      "Exception in thread Thread-7:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n",
      "Exception in thread Thread-8:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n",
      "Exception in thread Thread-21:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n",
      "Exception in thread Thread-4:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n",
      "Exception in thread Thread-12:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n",
      "Exception in thread Thread-23:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n",
      "Exception in thread Thread-10:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n",
      "Exception in thread Thread-18:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n",
      "Exception in thread Thread-16:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n",
      "Exception in thread Thread-15:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n",
      "Exception in thread Thread-6:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n",
      "Exception in thread Thread-11:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n",
      "Exception in thread Thread-19:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Exception in thread Thread-9:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n",
      "Exception in thread Thread-5:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n",
      "Exception in thread Thread-22:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n",
      "Exception in thread Thread-14:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "60\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Exception in thread Thread-26:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n",
      "Exception in thread Thread-25:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n",
      "Exception in thread Thread-24:\n",
      "Traceback (most recent call last):\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n",
      "    self.run()\n",
      "  File \"/root/app/anaconda3/lib/python3.7/threading.py\", line 865, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"<ipython-input-3-a21f9be91d30>\", line 15, in download_item\n",
      "    print(\"\\r 文件下载进度：%d%, 还需要$f seconds - %s\" % (now_jd, ctime/now_jd/100, \"./data/dlpg/tgz/{}\".format(item)), end=\" \")\n",
      "ValueError: unsupported format character ',' (0x2c) at index 12\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Download the release archives in batches, one thread per archive.\n",
    "# NOTE(review): 40 looks like a manual resume point -- use 0 for a full run;\n",
    "# download_item skips files that already exist.\n",
    "START_INDEX = 40\n",
    "BATCH_SIZE = 20  # number of download threads started per batch\n",
    "\n",
    "for i in range(START_INDEX, len(items), BATCH_SIZE):\n",
    "    print(i)\n",
    "    # Slicing clamps at the end of the list, so the last batch may be shorter.\n",
    "    jobs = [Thread(target=download_item, args=(item, )) for item in items[i:i + BATCH_SIZE]]\n",
    "    for job in jobs:\n",
    "        job.start()\n",
    "    # Wait for the whole batch to finish before starting the next one.\n",
    "    for job in jobs:\n",
    "        job.join()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'dblp-2016-12-01.xml.gz'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check: display the archive name stored at index 41 of `items`.\n",
    "items[41]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
